#!/usr/bin/env python
# coding: utf-8

# In[2]:


import pandas as pd
import matplotlib.pyplot as plt


# In[30]:



# ---------------------------------------------------------------------------
# Figure 2: number of unique discovery events per year.
# ---------------------------------------------------------------------------

# Load data
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
folder = "/Users/ling/Documents/research/papers/Scientific Data_LinzhuoLi/UploadToHarvardDataverse/"
df = pd.read_csv(folder + "Simonton_Multiple_Discoveries_Dataset.csv")

# Plotting window. The same two constants drive the filter, the title and the
# output filename so the three can never disagree. (Previously the filter used
# 1500 while the title/filename advertised 1491-1991, silently dropping any
# events dated 1491-1499.)
START_YEAR = 1491
END_YEAR = 1991

# Make sure Year is numeric; values that cannot be parsed become NaN
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")

# Keep one row per (Discovery, Year) pair and drop rows missing either field
events = df[["Discovery", "Year"]].drop_duplicates().dropna()

# Count unique discovery events by year
year_counts = events.groupby("Year").size()

# Restrict to the window advertised in the figure title
in_window = (year_counts.index >= START_YEAR) & (year_counts.index <= END_YEAR)
year_counts = year_counts[in_window]

# Plot (explicit figure/axes interface rather than the pyplot state machine)
fig, ax = plt.subplots(figsize=(12, 4))
ax.bar(year_counts.index, year_counts.values, width=1.0, color="SteelBlue")

ax.set_xlabel("Year", fontsize=14)
ax.set_ylabel("Number of independent discoveries", fontsize=14)
ax.set_title(
    f"Temporal distribution of independent discoveries, {START_YEAR}–{END_YEAR}",
    fontsize=16,
)

# Clean style: hide the top/right frame, light dashed horizontal grid
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.grid(axis="y", linestyle="--", alpha=0.4)
ax.tick_params(axis="both", labelsize=12)

fig.tight_layout()

# Save figure next to the dataset
fig.savefig(
    folder + f"Figure2_temporal_distribution_{START_YEAR}_{END_YEAR}.png",
    dpi=300,
    bbox_inches="tight",
)

plt.show()


# In[54]:


import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Figure 3: two-panel bar chart of row counts by discipline and by nationality.
# ---------------------------------------------------------------------------

# Load data (re-read here so this cell can run on its own)
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
folder = "/Users/ling/Documents/research/papers/Scientific Data_LinzhuoLi/UploadToHarvardDataverse/"
df = pd.read_csv(folder + "Simonton_Multiple_Discoveries_Dataset.csv")

# Clean nationality column: trim whitespace, then map empty strings and the
# textual spellings of "missing" to a real NA so dropna() removes them below.
df["Nationality"] = df["Nationality"].astype("string").str.strip()
df["Nationality"] = df["Nationality"].replace(
    {"": pd.NA, "NA": pd.NA, "<NA>": pd.NA, "nan": pd.NA, "NaN": pd.NA, "None": pd.NA}
)

# Counts (sorted ascending so the largest bar ends up at the top of a barh plot)
disc_counts = df["Discipline"].value_counts().sort_values(ascending=True)
country_counts = (
    df["Nationality"]
    .dropna()
    .value_counts()
    .head(10)  # ten most frequent nationalities only
    .sort_values(ascending=True)
)

# Style settings
tick_size = 12
label_size = 14
title_size = 16
panel_title_size = 14
value_size = 12


def _draw_count_panel(ax, counts, title, color, bar_height):
    """Draw one horizontal-bar panel with a numeric label beside each bar.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    counts : pandas Series; the index supplies bar labels, the values bar lengths.
    title : panel title text.
    color : bar fill colour.
    bar_height : thickness of each bar, passed to ``barh`` as ``height``.

    Returns
    -------
    The bar container returned by ``ax.barh``.

    Font sizes come from the module-level style settings defined above.
    """
    bars = ax.barh(counts.index, counts.values, height=bar_height, color=color)
    # Leave 20% headroom on the x-axis so the value labels fit inside the axes
    ax.set_xlim(0, counts.max() * 1.2)

    for bar in bars:
        length = bar.get_width()
        # Label sits 5 data units right of the bar end, vertically centred
        ax.text(
            length + 5,
            bar.get_y() + bar.get_height() / 2,
            f"{int(length)}",
            va="center",
            fontsize=value_size,
        )

    ax.set_title(title, fontsize=panel_title_size)
    ax.set_xlabel("Number of independent discoveries", fontsize=label_size)
    ax.set_ylabel("")
    ax.tick_params(axis="both", labelsize=tick_size)
    ax.grid(axis="x", linestyle="--", alpha=0.4)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    return bars


# Create two-panel figure
fig, axes = plt.subplots(1, 2, figsize=(12, 5), constrained_layout=True)

# Panel A: Discipline
bars1 = _draw_count_panel(
    axes[0], disc_counts, "A. Scientific discipline", "#F5A000", 0.4
)

# Panel B: Nationality
bars2 = _draw_count_panel(
    axes[1], country_counts, "B. Scientist nationality", "Brown", 0.6
)

# Overall title
fig.suptitle(
    "Distribution of independent discoveries by discipline and scientist nationality",
    fontsize=title_size
)

# Save figure next to the dataset
fig.savefig(folder + "Figure3_discipline_nationality_distribution.png", dpi=300, bbox_inches="tight")
plt.show()


# In[ ]:





# In[ ]:




